# Also do Session > Set Working Directory > Choose Directory
knitr::opts_knit$set(root.dir = ".")
library(BiocParallel) # SnowParam()
library(dplyr) # left_join()
library(edgeR) # DGEList()
library(limma) # plotMDS()
library(ggrepel) # geom_text_repel()
library(ggplot2) # ggplot()
library(gplots) # heatmap.2()
library(grDevices) # colorRampPalette()
library(rtracklayer) # import()
library(stringr) # str_match()
library(variancePartition) # fitExtractVarPartModel()
library(reshape) # melt()
library(Glimma)
library(plyr)
library(corrplot)
library(ggpubr)
sex_karyotype <- "XX"
control <- "CONTROL"
condition <- "LBD"
control_color <- "gray29"
condition_color <- "red"
myContrasts <- c("LBD - CONTROL")
tool = c("star")
pathToRef = c("/research/labs/neurology/fryer/projects/references/human/")
pathToRawData = c("/research/labs/neurology/fryer/projects/LBD_CWOW/")
# read in metadata
metadata <- read.delim(paste0(pathToRawData, "metadata.tsv"),
header = TRUE,
sep = "\t")
RIN <- read.delim(paste0(pathToRawData, "RIN.tsv"),
header = TRUE,
sep = "\t")
RIN$NPID <- RIN$Sample
metadata <- merge(metadata, RIN, by = "NPID")
# read in counts data
counts <- read.delim(
paste0("../../featureCounts/LBD_", sex_karyotype,".counts"),
header = TRUE,
sep = "\t", check.names=FALSE)
# get the sampleID from the counts file to match with metadata
sampleIDs <- colnames(counts)
samples <- as.data.frame(sampleIDs)
# split the sampleID by _
sample_count_id <- as.data.frame(str_split_fixed(samples$sampleID, "_", 2))
sample_count_id$counts_id <- samples$sampleIDs
# rename columns
names(sample_count_id)[names(sample_count_id) == 'V1'] <- 'NPID'
names(sample_count_id)[names(sample_count_id) == 'V2'] <- 'flow_lane'
# merge sample_count_id and metadata files
# this way metadata contains the sampleID to match the counts table
counts_metadata <- merge(metadata, sample_count_id, by = "NPID")
# gene information
counts.geneid <- read.delim(
"../../featureCounts/gene_info.txt",
header = TRUE,
sep = "\t"
)
counts.geneid <- as.data.frame(counts.geneid$Geneid)
colnames(counts.geneid) <- "gene_id"
# add gene_name as row names to counts file
rownames(counts) <- counts.geneid$gene_id
# read in annotation file
gtf.file <- paste0(pathToRef, "gencode.v38.annotation.gtf")
genes.gtf <- rtracklayer::import(gtf.file)
genes.gtf <- as.data.frame(genes.gtf)
genes.gtf <- genes.gtf[genes.gtf$type == "gene",]
table(genes.gtf$gene_type)
##
## IG_C_gene IG_C_pseudogene
## 14 9
## IG_D_gene IG_J_gene
## 37 18
## IG_J_pseudogene IG_pseudogene
## 3 1
## IG_V_gene IG_V_pseudogene
## 145 187
## lncRNA miRNA
## 16888 1879
## misc_RNA Mt_rRNA
## 2212 2
## Mt_tRNA polymorphic_pseudogene
## 22 49
## processed_pseudogene protein_coding
## 10163 19955
## pseudogene ribozyme
## 15 8
## rRNA rRNA_pseudogene
## 47 497
## scaRNA scRNA
## 49 1
## snoRNA snRNA
## 943 1901
## sRNA TEC
## 5 1056
## TR_C_gene TR_D_gene
## 6 4
## TR_J_gene TR_J_pseudogene
## 79 4
## TR_V_gene TR_V_pseudogene
## 106 33
## transcribed_processed_pseudogene transcribed_unitary_pseudogene
## 502 143
## transcribed_unprocessed_pseudogene translated_processed_pseudogene
## 950 2
## translated_unprocessed_pseudogene unitary_pseudogene
## 1 98
## unprocessed_pseudogene vault_RNA
## 2614 1
#counts.geneid
#names(counts.geneid)[1] <- 'gene_name'
joined.df <- join(counts.geneid, genes.gtf, type = "inner")
## Joining by: gene_id
# check columns and rows match up between files
all.equal(rownames(counts), counts.geneid$gene_id)
## [1] TRUE
all.equal(rownames(counts), genes.gtf$gene_id)
## [1] TRUE
# reorder counts to be in the same order as metadata table
counts <- counts[, counts_metadata$counts_id]
all.equal(colnames(counts), (counts_metadata$counts_id))
## [1] TRUE
# create object
dge <- DGEList(counts = counts,
samples = counts_metadata,
genes = genes.gtf)
table(dge$samples$TYPE)
##
## CONTROL CONTROL - AD CONTROL - PA LBD
## 8 8 2 38
# remove AD and PA controls
remove <- vector()
for (i in 1:nrow(dge$samples)) {
if (dge$samples$TYPE[i] == "CONTROL - AD" | dge$samples$TYPE[i] == "CONTROL - PA") {
remove <- c(remove, i)
}
}
dge <- dge[,-remove, keep.lib.sizes = FALSE]
table(dge$samples$TYPE)
##
## CONTROL LBD
## 8 38
column 1 is pid id column 15 is group information (LPS or control) column 18 is date of birth
saveRDS(dge, file = paste0("../../rObjects/", condition, "_", tool,"_", sex_karyotype, "_gene_raw.rds"))
dim(dge)
## [1] 60649 46
removeMT <- dge$genes$seqnames != "chrM" # true when NOT MT
dge <- dge[removeMT,,keep.lib.sizes = FALSE]
dim(dge)
## [1] 60612 46
These functions with help simultaneously save plots as a png, pdf, and tiff file.
saveToPDF <- function(...) {
d = dev.copy(pdf,...)
dev.off(d)
}
saveToPNG <- function(...) {
d = dev.copy(png,...)
dev.off(d)
}
# set colors and get data
table(dge$samples$TYPE)
##
## CONTROL LBD
## 8 38
dge$samples$TYPE <- as.factor(dge$samples$TYPE)
group_colors <- c("black","red")[dge$samples$TYPE]
lcpm <- edgeR::cpm(dge$counts, log = TRUE)
par(bg = 'white')
# plot MDS
plotMDS(
lcpm,
top = 100,
labels = dge$samples$NPID,
cex = 0.8,
dim.plot = c(1,2),
plot = TRUE,
col = group_colors,
gene.selection = "common"
)
title(expression('Top 100 Genes - Raw (Log'[2]~'CPM)'))
#legend(
# "bottomleft",
# legend = c(control, condition),
# pch = 16,
# col = c("black","red"),
# cex = 0.8
#)
path <- paste0("../../results/", tool, "/MDS/", condition,"_gene_MDS_dim1&2_techreps_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 4, height = 4)
## png
## 2
saveToPNG(paste0(path, ".png"), width = 4, height = 4, unit = "in", res = 300)
## png
## 2
# plot MDS
plotMDS(
lcpm,
top = 100,
labels = dge$samples$NPID,
cex = 0.8,
dim.plot = c(2,3),
plot = TRUE,
col = group_colors,
gene.selection = "common"
)
title(expression('Top 100 Genes - Raw (Log'[2]~'CPM)'))
path <- paste0("../../results/", tool, "/MDS/", condition,"_gene_MDS_dim2&3_techreps_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 4, height = 4)
## png
## 2
saveToPNG(paste0(path, ".png"), width = 4, height = 4, unit = "in", res = 300)
## png
## 2
# first filter by expression and normalize the data
keep.expr <- filterByExpr(dge, group = dge$samples$TYPE)
dim(dge)
## [1] 60612 46
dge.filtered.reps <- dge[keep.expr, , keep.lib.sizes = FALSE]
dim(dge.filtered.reps)
## [1] 16319 46
table(dge.filtered.reps$genes$gene_type)
##
## IG_C_gene lncRNA
## 1 1942
## miRNA misc_RNA
## 3 5
## polymorphic_pseudogene processed_pseudogene
## 3 121
## protein_coding scaRNA
## 13832 1
## scRNA snoRNA
## 1 3
## snRNA TEC
## 5 91
## TR_C_gene transcribed_processed_pseudogene
## 3 52
## transcribed_unitary_pseudogene transcribed_unprocessed_pseudogene
## 30 196
## unitary_pseudogene unprocessed_pseudogene
## 1 29
# Now, normalization by the method of trimmed mean of M-values (TMM)
dge.filtered.reps.norm <- calcNormFactors(dge.filtered.reps, method = "TMM")
# norm factor summary
summary(dge.filtered.reps.norm$samples$norm.factors)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.6909 0.9265 1.0322 1.0099 1.1152 1.1979
log2cpm.norm.reps <- edgeR::cpm(dge.filtered.reps.norm, log = T)
nsamples <- ncol(dge.filtered.reps.norm)
boxplot(log2cpm.norm.reps,
main="Filtered normalized lcpm data with replicates",
xlab="RIN",
ylab=expression('Counts per gene (Log'[2]~'CPM)'),
axes=FALSE)
axis(2)
axis(1,at=1:nsamples,labels=(dge.filtered.reps.norm$samples$RIN),las=2,cex.axis=0.8)
path <- paste0("../../results/", tool, "/boxplot/", condition, "_gene_boxplot_with_reps_with_RIN_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 18, height = 6)
## png
## 2
saveToPNG(paste0(path, ".png"), width = 18, height = 6, unit = "in", res = 300)
## png
## 2
# sum technical replicates
dim(dge)
## [1] 60612 46
dge.tech <- sumTechReps(dge, dge$samples$NPID)
dim(dge.tech$counts)
## [1] 60612 23
colnames(dge.tech$counts) <- dge.tech$samples$NPID
# set colors and get data
group_colors <- c("black", "red")[dge.tech$samples$TYPE]
lcpm <- edgeR::cpm(dge.tech$counts, log = TRUE)
par(bg = 'white')
# plot MDS
plotMDS(
lcpm,
top = 100,
labels = dge.tech$samples$NPID,
cex = .8,
dim.plot = c(1,2),
plot = TRUE,
col = group_colors,
gene.selection = "common"
)
title(expression('Top 100 Genes - Raw (Log'[2]~'CPM)'))
#legend(
# "top",
# legend = c(control, condition),
# pch = 16,
# col = c(control_color, condition_color),
# cex = 0.8
#)
# save
path <- paste0("../../results/", tool, "/MDS/", condition, "_gene_MDS_dim1&2_raw_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 4, height = 4)
## png
## 2
saveToPNG(paste0(path, ".png"), width = 4, height = 4, unit = "in", res = 300)
## png
## 2
# plot MDS
plotMDS(
lcpm,
top = 100,
labels = dge.tech$samples$RIN,
cex = .8,
dim.plot = c(1,2),
plot = TRUE,
col = group_colors,
gene.selection = "common"
)
title(expression('Top 100 Genes - Raw (Log'[2]~'CPM)'))
#legend(
# "top",
# legend = c(control, condition),
# pch = 16,
# col = c(control_color, condition_color),
# cex = 0.8
#)
# save
path <- paste0("../../results/", tool, "/MDS/", condition, "_gene_MDS_dim1&2_raw_", sex_karyotype, "_label_RIN")
saveToPDF(paste0(path, ".pdf"), width = 4, height = 4)
## png
## 2
saveToPNG(paste0(path, ".png"), width = 4, height = 4, unit = "in", res = 300)
## png
## 2
# plot MDS
plotMDS(
lcpm,
top = 100,
labels = dge.tech$samples$RIN,
cex = .8,
dim.plot = c(2,3),
plot = TRUE,
col = group_colors,
gene.selection = "common"
)
title(expression('Top 100 Genes - Raw (Log'[2]~'CPM)'))
path <- paste0("../../results/", tool, "/MDS/", condition, "_gene_MDS_dim2&3_raw_", sex_karyotype, "_label_RIN")
saveToPDF(paste0(path, ".pdf"), width = 4, height = 4)
## png
## 2
saveToPNG(paste0(path, ".png"), width = 4, height = 4, unit = "in", res = 300)
## png
## 2
The filterByExpr() function in the edgeR package determines which genes have a great enough count value to keep. We will filter by group. This means at least 6 samples (6 is the smallest group sample size) must express a minimum count of 10 (in cpm, default value).
keep.expr <- filterByExpr(dge.tech, group = dge.tech$samples$TYPE)
dim(dge.tech)
## [1] 60612 23
dge.filtered <- dge.tech[keep.expr, , keep.lib.sizes = FALSE]
dim(dge.filtered)
## [1] 18389 23
table(dge.filtered$genes$gene_type)
##
## IG_C_gene lncRNA
## 2 2918
## miRNA misc_RNA
## 6 15
## polymorphic_pseudogene processed_pseudogene
## 5 231
## protein_coding scaRNA
## 14584 1
## scRNA snoRNA
## 1 14
## snRNA TEC
## 14 156
## TR_C_gene transcribed_processed_pseudogene
## 4 79
## transcribed_unitary_pseudogene transcribed_unprocessed_pseudogene
## 42 257
## unitary_pseudogene unprocessed_pseudogene
## 1 59
# Now, normalization by the method of trimmed mean of M-values (TMM)
dge.filtered.norm <- calcNormFactors(dge.filtered, method = "TMM")
# norm factor summary
summary(dge.filtered.norm$samples$norm.factors)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.6855 0.9293 1.0383 1.0100 1.1155 1.1967
Density plots of log-intensity distribution of each library can be superposed on a single graph for a better comparison between libraries and for identification of libraries with weird distribution.
# set graphical parameter
par(mfrow = c(1,3))
# Normalize data for library size and expression intesntiy
log2cpm.tech <- edgeR::cpm(dge.tech, log = TRUE)
log2cpm.filtered <- edgeR::cpm(dge.filtered, log = TRUE)
log2cpm.norm <- edgeR::cpm(dge.filtered.norm, log = TRUE)
# set colors
colors <- c("red","orange","green","yellow","blue","purple",
"lightgray","brown","pink","cyan")
nsamples <- ncol(dge.tech)
# First, plot the first column of the log2cpm.tech density
plot(density(log2cpm.tech[,1]), col = colors[1], lwd = 2, ylim = c(0,0.25),
las = 2, main = "A. Raw", xlab = expression('Log'[2]~CPM))
# For each sample plot the lcpm density
for (i in 2:nsamples){
den <- density(log2cpm.tech[,i]) #subset each column
lines(den$x, den$y, col = colors[i], lwd = 2)
}
# Second, plot log2cpm.filtered
plot(density(log2cpm.filtered[,1]), col = colors[1], lwd = 2, ylim = c(0,0.25),
las = 2, main = "B. Filtered", xlab = expression('Log'[2]~CPM))
abline(v = edgeR::cpm(3, log = TRUE), lty = 3)
for (i in 2:nsamples) {
den <- density(log2cpm.filtered[,i])
lines(den$x, den$y, col = colors[i], lwd = 2)
}
# Third, plot log2cpm.norm
plot(density(log2cpm.norm[,1]), col = colors[1], lwd = 2, ylim = c(0,0.25),
las = 2, main = "C. Normalized", xlab = expression('Log'[2]~CPM))
abline(v = edgeR::cpm(3, log = TRUE), lty = 3)
for (i in 2:nsamples) {
den <- density(log2cpm.norm[,i])
lines(den$x, den$y, col = colors[i], lwd = 2)
}
# save
path <- paste0("../../results/", tool, "/density/", condition, "_gene_density_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 6, height = 4)
## png
## 2
saveToPNG(paste0(path, ".png"), width = 6, height = 4, unit = "in", res = 300)
## png
## 2
# set parameters
par(mfrow = c(1,3))
# First look at dge.tech
boxplot(log2cpm.tech,
main="A. Raw",
xlab="",
ylab=expression('Counts per gene (Log'[2]~'CPM)'),
axes=FALSE,
col = colors
)
axis(2) # 2 = left
axis(1, # 1 = below
at = 1:nsamples, # points at which tick-marks should be drawn
labels = colnames(log2cpm.tech),
las = 2,
cex.axis = 0.8 # size of axis
)
# Second, look at dge.filtered
boxplot(log2cpm.filtered,
main="B. Filtered",
xlab="",
ylab=expression('Counts per gene (Log'[2]~'CPM)'),
axes=FALSE,
col = colors
)
axis(2)
axis(1, at=1:nsamples,labels=colnames(log2cpm.filtered),las=2,cex.axis=0.8)
# Third, look at dge.norm
boxplot(log2cpm.norm,
main="C. Normalized",
xlab="",
ylab=expression('Counts per gene (Log'[2]~'CPM)'),
axes=FALSE,
col = colors)
axis(2)
axis(1,at=1:nsamples,labels=colnames(log2cpm.norm),las=2,cex.axis=0.8)
# save
path <- paste0("../../results/", tool, "/boxplot/", condition, "_gene_boxplot_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 10, height = 6)
## png
## 2
saveToPNG(paste0(path, ".png"), width = 10, height = 6, unit = "in", res = 300)
## png
## 2
boxplot(log2cpm.norm,
main="Filtered normalized lcpm data",
xlab="RIN",
ylab=expression('Counts per gene (Log'[2]~'CPM)'),
axes=FALSE,
col = colors)
axis(2)
axis(1,at=1:nsamples,labels=(dge.filtered.norm$samples$RIN),las=2,cex.axis=0.8)
path <- paste0("../../results/", tool, "/boxplot/", condition, "_gene_boxplot_sum_reps_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 12, height = 6)
## png
## 2
saveToPNG(paste0(path, ".png"), width = 12, height = 6, unit = "in", res = 300)
## png
## 2
# check if there is correlation between RIN and library size
box <- dge.filtered.norm$samples
plot(box$RIN, box$lib.size)
cor(box$RIN, box$lib.size, method = c("pearson", "kendall", "spearman"))
## [1] 0.2219294
cor.test(box$RIN, box$lib.size, method=c("pearson", "kendall", "spearman"))
##
## Pearson's product-moment correlation
##
## data: box$RIN and box$lib.size
## t = 1.043, df = 21, p-value = 0.3088
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2094316 0.5809833
## sample estimates:
## cor
## 0.2219294
# is the data normally distrubited
ggqqplot(box$lib.size, ylab = "library size")
# wt
ggqqplot(box$RIN, ylab = "RIN")
res <- cor.test(box$lib.size, box$RIN,
method = "pearson")
res
##
## Pearson's product-moment correlation
##
## data: box$lib.size and box$RIN
## t = 1.043, df = 21, p-value = 0.3088
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2094316 0.5809833
## sample estimates:
## cor
## 0.2219294
ggscatter(box, x = "RIN", y = "lib.size",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
ylab = "library size", xlab = "RIN value")
## `geom_smooth()` using formula 'y ~ x'
path <- paste0("../../results/", tool, "/library/", condition, "_corr_RIN_lib_size_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 6, height = 6)
## png
## 2
saveToPNG(paste0(path, ".png"), width = 6, height = 6, unit = "in", res = 300)
## png
## 2
#form <- ~ TYPE + Age + Sex + ClinicalDx +lib.size + LBD.type +CDLB
form <- ~ TYPE + PathDx + AD.subtype + LBD.type + CDLB + Braak.NFT + Thal.amyloid + MF.SP + MF.NFT + MF.LB + Cing.LB + MF.Amyloid + MF.Tau + Cing.Synuclein + CWOW.Category + VaD + TDP.type + Brain.wt + ClinicalDx + FHx + Duration + Age + PMI + APOE + MAPT + GRN + TMEM106b + RIN + Total.RNA.ng
# removed Race since there is only white samples in this set.
# Compute Canonical Correlation Analysis (CCA)
# between all pairs of variables
# returns absolute correlation value
info <- as.data.frame(dge.filtered.norm$samples)
C = canCorPairs( form, info)
## Warning in canCorPairs(form, info): Regression model may be problematic.
## High colinearity between variables:
## TYPE and PathDx
## TYPE and LBD.type
## TYPE and CDLB
## TYPE and CWOW.Category
## PathDx and AD.subtype
## PathDx and LBD.type
## PathDx and CDLB
## PathDx and MF.NFT
## PathDx and MF.Tau
## PathDx and Cing.Synuclein
## PathDx and VaD
## MF.SP and CWOW.Category
## MF.Amyloid and CWOW.Category
## MF.Tau and CWOW.Category
## Cing.Synuclein and CWOW.Category
# Plot correlation matrix
#plotCorrMatrix( C , key.xlab = "correlation")
# replace NA with zero
C[is.na(C)] = 0
cor.mtest <- function(C, ...) {
C <- as.matrix(C)
n <- ncol(C)
p.mat<- matrix(NA, n, n)
diag(p.mat) <- 0
for (i in 1:(n - 1)) {
for (j in (i + 1):n) {
tmp <- cor.test(C[, i], C[, j], ...)
p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
}
}
colnames(p.mat) <- rownames(p.mat) <- colnames(C)
p.mat
}
# matrix of the p-value of the correlation
p.mat <- cor.mtest(C)
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(C, method="color", col=col(200),
type="upper", order="hclust",
addCoef.col = "black", # Add coefficient of correlation
tl.col="black", tl.srt=45, #Text label color and rotation
# Combine with significance
# p.mat = p.mat, sig.level = 0.1, insig = "blank",
# hide correlation coefficient on the principal diagonal
diag=FALSE
)
path <- paste0("../../results/", tool ,"/varpart/LBD_gene_CCA", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 25, height = 25)
## png
## 2
saveToPNG(paste0(path, ".png"), width = 23, height = 25, unit = "in", res = 300)
## png
## 2
# CCA for only a handful of variables
form <- ~ TYPE + Brain.wt + Duration + Age + PMI + RIN
# removed Race since there is only white samples in this set.
# Compute Canonical Correlation Analysis (CCA)
# between all pairs of variables
# returns absolute correlation value
info <- as.data.frame(dge.filtered.norm$samples)
C = canCorPairs( form, info)
# Plot correlation matrix
#plotCorrMatrix( C , key.xlab = "correlation")
# replace NA with zero
C[is.na(C)] = 0
cor.mtest <- function(C, ...) {
C <- as.matrix(C)
n <- ncol(C)
p.mat<- matrix(NA, n, n)
diag(p.mat) <- 0
for (i in 1:(n - 1)) {
for (j in (i + 1):n) {
tmp <- cor.test(C[, i], C[, j], ...)
p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
}
}
colnames(p.mat) <- rownames(p.mat) <- colnames(C)
p.mat
}
# matrix of the p-value of the correlation
p.mat <- cor.mtest(C)
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(C, method="color", col=col(200),
type="upper", order="hclust",
addCoef.col = "black", # Add coefficient of correlation
tl.col="black", tl.srt=45, #Text label color and rotation
# Combine with significance
# p.mat = p.mat, sig.level = 0.1, insig = "blank",
# hide correlation coefficient on the principal diagonal
diag=FALSE
)
saveRDS(dge.filtered.norm, file = paste0("../../rObjects/", condition, "_dge.filtered.norm", sex_karyotype, ".rds"))
#dge.filtered.norm <- readRDS( file = paste0("../../rObjects/", condition, "_dge.filtered.norm", sex_karyotype, ".rds"))
register(SnowParam(detectCores())) # work in parallel (takes a while to run)
# geneExpr: matrix of gene expression values
# info: information/metadata about each sample
geneExpr <- as.matrix(dge.filtered.norm$counts)
info <- as.data.frame(dge.filtered.norm$samples)
info$RIN <- as.integer(info$RIN)
info$TYPE <- as.character(info$TYPE)
# age is usually a continuous so model it as a fixed effect "age"
# group is categorical, so model them as random effects "(1|group)"
# note the syntax
form <- ~Age + PMI + RIN + (1|TYPE)# + (1 | CWOW.Category)
#
#(1 | LBD.type) +
#(1 | TDP.type) +
#(1 | ClinicalDx) +
#(1 | FHx) +
#(1 | Sex) +
#(1 | Race) +
#(1 | TMEM106b) +
#Braak.NFT +
#Thal.amyloid +
#MF.SP +
#MF.LB +
#Cing.LB +
#MF.Amyloid +
#MF.Tau +
#Cing.Synuclein +
#VaD +
#TDP.43 +
#Brain.wt +
#Duration +
#Age +
#PMI +
#RIN
varPart <- fitExtractVarPartModel(geneExpr, form, info)
## Warning in .fitExtractVarPartModel(exprObj, formula, data, REML = REML, : Variables contain NA's: PMI
## Samples with missing data will be dropped.
vp <- sortCols(varPart)
# plot
plotVarPart(vp)
# save
path <- paste0("../../results/", tool, "/varpart/", condition, "_gene_varpart_violins_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 6, height = 4)
## png
## 2
saveToPNG(paste0(path, ".png"), width = 6, height = 4, unit = "in", res = 300)
## png
## 2
# plot
plotPercentBars(vp[1:10,])
# make a column called gene_id to later merge with genes.gtf
vp$gene_id <- row.names(vp)
# dataframe of gene_id and gene_name
gene_id_name <- cbind(genes.gtf$gene_id, genes.gtf$gene_name)
df <- as.data.frame(gene_id_name)
names(df)[names(df) == 'V1'] <- 'gene_id'
names(df)[names(df) == 'V2'] <- 'gene_name'
vp_df <- merge(vp, df)
row.names(vp_df) <- make.names(vp_df$gene_name, unique = TRUE)
vp_df$gene_id <- NULL
vp_df$gene_name <- NULL
plotPercentBars(vp_df[1:10,])
# save
path <- paste0("../../results/", tool, "/varpart/", condition, "_gene_percent_bars", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 6, height = 6)
## png
## 2
saveToPNG(paste0(path, ".png"), width = 6, height = 6, unit = "in", res = 300)
## png
## 2
varPart.df <- as.data.frame(vp_df)
# sort genes based on variance explained by group
order.varPart.df <- varPart.df[order(varPart.df$RIN, decreasing = TRUE),]
head(order.varPart.df["RIN"], 10)
# sort genes based on variance explained by Race
order.varPart.df <- varPart.df[order(varPart.df$Race, decreasing = TRUE),]
head(order.varPart.df["Race"], 10)
age <- as.numeric(dge.filtered.norm$samples$Age)
RIN <- as.numeric(dge.filtered.norm$samples$RIN)
race <- as.factor(dge.filtered.norm$samples$Race)
group <- interaction(dge.filtered.norm$samples$TYPE)
design <- model.matrix(~ 0 + group)
colnames(design) <- c(control,condition)
design
## CONTROL LBD
## 1 0 1
## 2 1 0
## 3 1 0
## 4 0 1
## 5 0 1
## 6 0 1
## 7 1 0
## 8 0 1
## 9 0 1
## 10 0 1
## 11 0 1
## 12 0 1
## 13 0 1
## 14 0 1
## 15 0 1
## 16 0 1
## 17 0 1
## 18 0 1
## 19 0 1
## 20 0 1
## 21 1 0
## 22 0 1
## 23 0 1
## attr(,"assign")
## [1] 1 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
# voom transform counts
v <- voomWithQualityWeights(dge.filtered.norm,
design,
plot = TRUE)
# save
path <- paste0("../../results/", tool, "/voom/", condition, "_gene_mean_var_weights_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 6, height = 4)
## png
## 2
saveToPNG(paste0(path, ".png"), width = 6, height = 4, unit = "in", res = 300)
## png
## 2
# fits linear model for each gene given a series of arrays
fit <- lmFit(v, design)
# contrast design for differential expression
contrasts <- makeContrasts(
title = myContrasts, # myContrasts was user input from beginning
levels = colnames(design))
head(contrasts)
## Contrasts
## Levels LBD - CONTROL
## CONTROL -1
## LBD 1
# save contrast names
allComparisons <- colnames(contrasts)
allComparisons # check
## [1] "LBD - CONTROL"
# run contrast analysis
vfit <- contrasts.fit(fit, contrasts = contrasts)
# Compute differential expression based on the empirical Bayes moderation of the
# standard errors towards a common value.
veBayesFit <- eBayes(vfit)
plotSA(veBayesFit, main = "Final Model: Mean-variance Trend")
# save
path <- paste0("../../results/", tool, "/voom/", condition, "_gene_final_mean_var_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 6, height = 4)
## png
## 2
saveToPNG(paste0(path, ".png"), width = 6, height = 4, unit = "in", res = 300)
## png
## 2
group_colors <- c("black", "red")[v$targets$TYPE]
names <- v$targets$NPID
plotMDS(
v,
top = 100,
labels = names,
cex = 1,
dim.plot = c(1,2),
plot = TRUE,
col = group_colors
)
title(expression('Top 100 Genes - Voom (Log'[2]~'CPM)'))
# save
# save
path <- paste0("../../results/", tool, "/MDS/", condition, "_gene_MDS_normalized_dim1&2_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 5, height = 5)
## png
## 2
saveToPNG(paste0(path, ".png"), width = 4, height = 4, unit = "in", res = 300)
## png
## 2
plotMDS(
v,
top = 100,
labels = names,
cex = 1,
dim.plot = c(2,3),
plot = TRUE,
col = group_colors
)
title(expression('Top 100 Genes - Voom (Log'[2]~'CPM)'))
# save
# save
path <- paste0("../../results/", tool, "/MDS/", condition, "_gene_MDS_normalized_dim2&3_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 5, height = 5)
## png
## 2
saveToPNG(paste0(path, ".png"), width = 4, height = 4, unit = "in", res = 300)
## png
## 2
saveRDS(v, file = paste0("../../rObjects/", condition, "_gene_voom", sex_karyotype, ".rds"))
Identify number of differential expressed genes.
pval <- 0.05
sumTable <-
summary(decideTests(
vfit, # object
adjust.method = "BH", # by default the method = "separate"
p.value = pval,
lfc = 0 # numeric, minimum absolute log2-fold change required
))
print(paste0(" FDRq < ", pval))
## [1] " FDRq < 0.05"
sumTable
## LBD - CONTROL
## Down 1
## NotSig 18388
## Up 0
coef <- 1
for (i in allComparisons) {
# p < 1, log2fc > 0
vTopTableAll <-
topTable(
veBayesFit,
coef = coef,
n = Inf,
p.value = 1,
lfc = 0
)
path <- paste("../../results/", tool, "/DEGs/", condition, "_gene_DEGs_FDRq1.00_", sex_karyotype, ".txt", sep = "")
write.table(
vTopTableAll,
path,
sep = "\t",
row.names = FALSE,
quote = FALSE
)
# p < 0.05, log2fc > 0
vTopTable1 <-
topTable(
veBayesFit,
coef = coef,
n = Inf,
p.value = 0.05,
lfc = 0
)
path <- paste("../../results/", tool, "/DEGs/", condition, "_gene_DEGs_FDRq0.05_", sex_karyotype, ".txt", sep = "")
write.table(
vTopTable1,
path,
sep = "\t",
row.names = FALSE,
quote = FALSE
)
# increment -----------------------------------------------------------------
coef <- coef + 1
}
Read and save table with all genes (FDRq = 1).
condition_vs_control <- read.table(
paste0("../../results/", tool, "/DEGs/", condition, "_gene_DEGs_FDRq1.00_", sex_karyotype, ".txt", sep = ""),
header = TRUE,
sep = "\t",
stringsAsFactors = FALSE)
saveRDS(condition_vs_control, file = paste0("../../rObjects/", condition, "_gene_table_", sex_karyotype, ".rds"))
Assign colors values based on FDRq cutoff of 0.2.
color_values <- vector()
max <- nrow(condition_vs_control)
for(i in 1:max){
if (condition_vs_control$adj.P.Val[i] < 0.2){
if (condition_vs_control$logFC[i] > 0){
color_values <- c(color_values, 1) # 1 when logFC > 0 and FDRq < 0.05
}
else if (condition_vs_control$logFC[i] < 0){
color_values <- c(color_values, 2) # 2 when logFC < 0 and FDRq < 0.05
}
}
else{
color_values <- c(color_values, 3) # 3 when FDRq >= 0.05
}
}
condition_vs_control$color_p0.05 <- factor(color_values)
Subset the top 10 up and down-regulated genes
up <- condition_vs_control[condition_vs_control$color_p0.05 == 1,]
up10 <- up[1:10,]
down <- condition_vs_control[condition_vs_control$color_p0.05 == 2,]
down <- subset(down, down$logFC < -1.5)
down10 <- down[1:7,]
hadjpval <- (-log10(max(
condition_vs_control$P.Value[condition_vs_control$adj.P.Val < 0.2],
na.rm=TRUE)))
p_vol <-
ggplot(data = condition_vs_control,
aes(x = logFC, # x-axis is logFC
y = -log10(P.Value), # y-axis will be -log10 of P.Value
color = color_p0.05)) + # color is based on factored color column
geom_point(alpha = 0.8, size = 1.7) + # create scatterplot, alpha makes points transparent
theme_bw() + # set color theme
theme(legend.position = "none") + # no legend
scale_color_manual(values = c("red", "blue","grey")) + # set factor colors
labs(
title = "", # no main title
x = expression(log[2](FC)), # x-axis title
y = expression(-log[10] ~ "(" ~ italic("p") ~ "-value)") # y-axis title
) +
theme(axis.title.x = element_text(size = 10),
axis.text.x = element_text(size = 10)) +
theme(axis.title.y = element_text(size = 10),
axis.text.y = element_text(size = 10)) +
geom_hline(yintercept = hadjpval, # horizontal line
colour = "#000000",
linetype = "dashed") +
ggtitle("LBD vs Control") +
theme(plot.title = element_text(size = 10)) +
geom_text_repel(data = up10,
aes(x = logFC, y= -log10(P.Value), label = gene_name),
color = "maroon",
fontface="italic",
size = 3,
max.overlaps = getOption("ggrepel.max.overlaps", default = 30)
) +
geom_text_repel(data = down10,
aes(x = logFC, y= -log10(P.Value), label = gene_name),
color = "navyblue",
fontface="italic",
size = 3,
max.overlaps = getOption("ggrepel.max.overlaps", default = 15)
) # +
# scale_y_continuous(breaks = seq(0,12,by=1), limits = c(0,12)) +
# scale_x_continuous(breaks = seq(-8,8,by=2), limits = c(-8,8))
p_vol
## Warning: Removed 9 rows containing missing values (geom_text_repel).
# save
path <- paste0("../../results/", tool, "/volcano/", condition, "_gene_volcano_FDRq0.05_", sex_karyotype)
saveToPDF(paste0(path, ".pdf"), width = 8, height = 6)
## png
## 2
saveToPNG(paste0(path, ".png"), width = 8, height = 6, unit = "in", res = 300)
## png
## 2
row.names(dge.filtered.norm$genes) <- make.names(dge.filtered.norm$genes$gene_name, unique = T)
glimmaVolcano(veBayesFit, dge = dge.filtered.norm, groups = dge.filtered.norm$samples$TYPE)